Load Requied packages and Read in data

library(plotly)
library(dplyr)
library(readxl)
library(tidyverse)
library(caret)

Movies_raw <- read_xlsx("Bechdel.xlsx", col_names = TRUE)
Movies_test <- read_csv("test.csv", col_names = TRUE)

Data cleaning

Movies <- Movies_raw
names(Movies) <- c("year", "imdb", "budget", "domgross", "intgross", "code","budget_2013", "domgross_2013",
                   "intgross_2013", "Test")

#str(Movies)
Movies <- as.data.frame(Movies[-1, -c(11, 12)])

Movies_num <- Movies[, -2]

func <- function(x){
  as.numeric(gsub("PASS", 1, gsub("FAIL", 0, x)))
}

Movies_num <- as.data.frame(lapply(Movies_num, FUN = func))

str(Movies)
## 'data.frame':    1500 obs. of  10 variables:
##  $ year         : chr  "1974" "1982" "2008" "2011" ...
##  $ imdb         : chr  "tt0071562" "tt0084516" "tt0800241" "tt1625346" ...
##  $ budget       : chr  "13000000" "10700000" "15000000" "12000000" ...
##  $ domgross     : chr  "57300000" "74706019" "2203641" "16311571" ...
##  $ intgross     : chr  "57300000" "121706019" "6379575" "22750356" ...
##  $ code         : chr  "1974PASS" "1982PASS" "2008PASS" "2011PASS" ...
##  $ budget_2013  : chr  "61408439" "25821968" "16233845" "12428289" ...
##  $ domgross_2013: chr  "270669505" "180285645" "2384904" "16893744" ...
##  $ intgross_2013: chr  "270669505" "293709241" "6904335" "23562334" ...
##  $ Test         : chr  "PASS" "PASS" "PASS" "PASS" ...
#mdata <- melt(Movies, id=c("year"))

#Remove id column
Mov_code <- Movies[,2]
#Movies %>% modify_at(c(1,3:7,8), as.numeric)

# Remove NA from data set
Movies_num <- Movies_num[complete.cases(Movies_num), ]

# Add provit variable gross less budget (assuming the budget = real cost to produce movies)
Movies_num$domProfit <- Movies_num$domgross - Movies_num$budget
Movies_num$intProfit <- Movies_num$intgross - Movies_num$budget
Movies_num$domProfit_2013 <- Movies_num$domgross_2013 - Movies_num$budget_2013
Movies_num$intProfit_2013 <- Movies_num$intgross_2013 - Movies_num$budget_2013

# Reorder columns and remove NA
Movies_num <- Movies_num[ ,c(1, 2, 3, 4, 10, 11, 6, 7, 8, 12, 13, 9)]

# Remove NA from data set
Movies_clean <- Movies_num[complete.cases(Movies_num), ]

summary(Movies_clean)
##       year          budget             domgross        
##  Min.   :1970   Min.   :     7000   Min.   :      828  
##  1st Qu.:1998   1st Qu.: 12000000   1st Qu.: 16244304  
##  Median :2005   Median : 29000000   Median : 42704878  
##  Mean   :2003   Mean   : 45555379   Mean   : 70440928  
##  3rd Qu.:2009   3rd Qu.: 63000000   3rd Qu.: 95052518  
##  Max.   :2013   Max.   :425000000   Max.   :760507625  
##     intgross           domProfit            intProfit         
##  Min.   :8.280e+02   Min.   :-201941321   Min.   :-110450242  
##  1st Qu.:2.583e+07   1st Qu.:  -7210930   1st Qu.:   5235822  
##  Median :7.875e+07   Median :   8541081   Median :  43905789  
##  Mean   :1.545e+08   Mean   :  24885549   Mean   : 108981403  
##  3rd Qu.:1.986e+08   3rd Qu.:  42571574   3rd Qu.: 135132192  
##  Max.   :2.784e+09   Max.   : 458672302   Max.   :2358918982  
##   budget_2013        domgross_2013       intgross_2013      
##  Min.   :     8632   Min.   :8.990e+02   Min.   :8.990e+02  
##  1st Qu.: 16143738   1st Qu.:2.022e+07   1st Qu.:3.278e+07  
##  Median : 36995786   Median :5.608e+07   Median :9.933e+07  
##  Mean   : 56311299   Mean   :9.719e+07   Mean   :2.037e+08  
##  3rd Qu.: 81684071   3rd Qu.:1.255e+08   3rd Qu.:2.467e+08  
##  Max.   :461435929   Max.   :1.772e+09   Max.   :3.172e+09  
##  domProfit_2013       intProfit_2013            Test       
##  Min.   :-204897453   Min.   :-120328632   Min.   :0.0000  
##  1st Qu.:  -8798028   1st Qu.:   6347922   1st Qu.:0.0000  
##  Median :  10831363   Median :  56550302   Median :0.0000  
##  Mean   :  40879050   Mean   : 147396397   Mean   :0.4474  
##  3rd Qu.:  55015145   3rd Qu.: 169980582   3rd Qu.:1.0000  
##  Max.   :1729408181   Max.   :3024171833   Max.   :1.0000
#plot_ly(Movies, x = ~`Dom gross`, y = ~Year, type = 'scatter', mode = 'lines')
#View(Movies)

Data Analysis

# Look at the distribution of the data 
hist_l = list()
#Movies_quant <- as.data.frame(Movies[,-c(2,6)])
hist_p <- function(df){
  for(i in 1:10){
    hist_l[i] <- plot_ly(x = df[ ,i], type = "histogram", name = names(df)[i]) 
  }
  return(hist_l)
}


hist_pl <- hist_p(Movies_clean)

unadjusted <- subplot(hist_pl[[2]], hist_pl[[3]], hist_pl[[4]], hist_pl[[5]], hist_pl[[1]], nrows = 4, margin = 0.04, 
                      heights = c(0.25, 0.25, 0.25, 0.25))
adjusted <- subplot(hist_pl[[7]], hist_pl[[8]], hist_pl[[9]], nrows = 3, margin = 0.04, heights = c(0.33, 0.33, 0.33))
#year
unadjusted
adjusted

Analyis part 2

# Look at the distribution of the data 
box_l <- list()

box_p <- function(df){
  for(i in 1:11){
    box_l[i] <- plot_ly(y = df[,i], type = "box", name = names(df)[i])
  }
  return(box_l)
}

str(Movies_clean)
## 'data.frame':    1484 obs. of  12 variables:
##  $ year          : num  1974 1982 2008 2011 2000 ...
##  $ budget        : num  1.30e+07 1.07e+07 1.50e+07 1.20e+07 2.60e+07 2.00e+07 3.00e+08 8.00e+07 2.00e+06 1.60e+07 ...
##  $ domgross      : num  57300000 74706019 2203641 16311571 45506619 ...
##  $ intgross      : num  5.73e+07 1.22e+08 6.38e+06 2.28e+07 7.58e+07 ...
##  $ domProfit     : num  44300000 64006019 -12796359 4311571 19506619 ...
##  $ intProfit     : num  44300000 111006019 -8620425 10750356 49763814 ...
##  $ budget_2013   : num  61408439 25821968 16233845 12428289 35175618 ...
##  $ domgross_2013 : num  2.71e+08 1.80e+08 2.38e+06 1.69e+07 6.16e+07 ...
##  $ intgross_2013 : num  2.71e+08 2.94e+08 6.90e+06 2.36e+07 1.03e+08 ...
##  $ domProfit_2013: num  209261066 154463677 -13848941 4465455 26390668 ...
##  $ intProfit_2013: num  209261066 267887273 -9329510 11134045 67325881 ...
##  $ Test          : num  1 1 1 1 0 0 1 0 1 1 ...
box_pl <- box_p(Movies_clean)

subplot(box_pl[[2]], box_pl[[3]], box_pl[[4]], box_pl[[5]], box_pl[[6]])
subplot(box_pl[[7]], box_pl[[8]], box_pl[[9]], box_pl[[10]], box_pl[[11]])
# Percent of movies that passed test all time
percent_pass <- sum(Movies_num$Test)/length(Movies$Test) * 100

Movies_order <- Movies_clean[order(Movies_clean$year),]
Movies_order <- Movies_order[, c(1, 12)]

sum_yr <- aggregate(Movies_order$Test, by=list(Year = Movies_order$year), FUN = sum)
sum_yr$Pass <- sum_yr$x

library(plyr)
freq <- count(Movies_order, "year")

Change_percent <- cbind(sum_yr, `Percent pass Berchdel test` = (sum_yr$x/freq$freq)*100)

succes_time <- plot_ly(sum_yr, x = ~Year, y = ~Pass,
                       type = "scatter", mode = "markers+lines") 

percent_time <- plot_ly(Change_percent, x = ~Year, y = ~`Percent pass Berchdel test`, 
                        type = "scatter", mode = "markers+lines") 

subplot(percent_time, succes_time, nrows = 2)

Bassed on the binary definition 44.2666667 \(\%\) of movies passed the test.

Data Prediction

Movies_clean$Test <- factor(Movies_clean$Test, levels = c(0, 1))

str(Movies_clean)
## 'data.frame':    1484 obs. of  12 variables:
##  $ year          : num  1974 1982 2008 2011 2000 ...
##  $ budget        : num  1.30e+07 1.07e+07 1.50e+07 1.20e+07 2.60e+07 2.00e+07 3.00e+08 8.00e+07 2.00e+06 1.60e+07 ...
##  $ domgross      : num  57300000 74706019 2203641 16311571 45506619 ...
##  $ intgross      : num  5.73e+07 1.22e+08 6.38e+06 2.28e+07 7.58e+07 ...
##  $ domProfit     : num  44300000 64006019 -12796359 4311571 19506619 ...
##  $ intProfit     : num  44300000 111006019 -8620425 10750356 49763814 ...
##  $ budget_2013   : num  61408439 25821968 16233845 12428289 35175618 ...
##  $ domgross_2013 : num  2.71e+08 1.80e+08 2.38e+06 1.69e+07 6.16e+07 ...
##  $ intgross_2013 : num  2.71e+08 2.94e+08 6.90e+06 2.36e+07 1.03e+08 ...
##  $ domProfit_2013: num  209261066 154463677 -13848941 4465455 26390668 ...
##  $ intProfit_2013: num  209261066 267887273 -9329510 11134045 67325881 ...
##  $ Test          : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 2 1 2 2 ...
table(Movies_clean$Test)
## 
##   0   1 
## 820 664
# Prep Training and Test data.
set.seed(50)
split_Movies <- createDataPartition(Movies_clean$Test, p=0.85, list = F) 
train <- Movies_clean[split_Movies, ]
test_split <- Movies_clean[-split_Movies, ]

table(train$Test)
## 
##   0   1 
## 697 565
set.seed(50)
scale_train_down <- downSample(x = train[, 2:11], y = train$Test, list = FALSE, yname = "Test")

table(scale_train_down$Test)
## 
##   0   1 
## 565 565
set.seed(50)
scale_train_up <- upSample(x = train[, 2:11], y = train$Test, list = FALSE, yname = "Test")
table(scale_train_up$Test)
## 
##   0   1 
## 697 697
# Look at different models 

Movies_glm <- glm(Test ~ budget + domgross + intgross + domProfit + intProfit 
                    + budget_2013 + domgross_2013 + intgross_2013 + domProfit_2013
                    + intProfit_2013, data = scale_train_up, family = binomial)

Movies_glm_2 <- glm(Test ~ budget + domgross + intgross
                    + budget_2013 + domgross_2013 + intgross_2013, 
                    data = scale_train_up, family = binomial)

Movies_glm_1 <- glm(Test ~ budget + domgross +  budget_2013 + domgross_2013, 
                    data = scale_train_up, family = binomial)

summary(Movies_glm)
## 
## Call:
## glm(formula = Test ~ budget + domgross + intgross + domProfit + 
##     intProfit + budget_2013 + domgross_2013 + intgross_2013 + 
##     domProfit_2013 + intProfit_2013, family = binomial, data = scale_train_up)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5082  -1.1906   0.4406   1.0991   2.0967  
## 
## Coefficients: (4 not defined because of singularities)
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     3.725e-01  8.514e-02   4.375 1.21e-05 ***
## budget          5.084e-09  7.227e-09   0.703 0.481793    
## domgross        2.047e-08  6.155e-09   3.325 0.000884 ***
## intgross       -8.838e-09  3.176e-09  -2.783 0.005389 ** 
## domProfit              NA         NA      NA       NA    
## intProfit              NA         NA      NA       NA    
## budget_2013    -1.111e-08  6.057e-09  -1.834 0.066653 .  
## domgross_2013  -1.693e-08  4.565e-09  -3.709 0.000208 ***
## intgross_2013   7.685e-09  2.439e-09   3.151 0.001625 ** 
## domProfit_2013         NA         NA      NA       NA    
## intProfit_2013         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1932.5  on 1393  degrees of freedom
## Residual deviance: 1877.2  on 1387  degrees of freedom
## AIC: 1891.2
## 
## Number of Fisher Scoring iterations: 4
summary(Movies_glm_1)
## 
## Call:
## glm(formula = Test ~ budget + domgross + budget_2013 + domgross_2013, 
##     family = binomial, data = scale_train_up)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4208  -1.1895   0.3918   1.1129   2.5284  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    3.244e-01  8.011e-02   4.049 5.14e-05 ***
## budget        -4.076e-09  6.595e-09  -0.618  0.53654    
## domgross       6.396e-09  2.180e-09   2.934  0.00334 ** 
## budget_2013   -3.472e-09  5.618e-09  -0.618  0.53649    
## domgross_2013 -4.209e-09  1.421e-09  -2.963  0.00305 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1932.5  on 1393  degrees of freedom
## Residual deviance: 1889.5  on 1389  degrees of freedom
## AIC: 1899.5
## 
## Number of Fisher Scoring iterations: 4
summary(Movies_glm_2) #Best model so far
## 
## Call:
## glm(formula = Test ~ budget + domgross + intgross + budget_2013 + 
##     domgross_2013 + intgross_2013, family = binomial, data = scale_train_up)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5082  -1.1906   0.4406   1.0991   2.0967  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    3.725e-01  8.514e-02   4.375 1.21e-05 ***
## budget         5.084e-09  7.227e-09   0.703 0.481793    
## domgross       2.047e-08  6.155e-09   3.325 0.000884 ***
## intgross      -8.838e-09  3.176e-09  -2.783 0.005389 ** 
## budget_2013   -1.111e-08  6.057e-09  -1.834 0.066653 .  
## domgross_2013 -1.693e-08  4.565e-09  -3.709 0.000208 ***
## intgross_2013  7.685e-09  2.439e-09   3.151 0.001625 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1932.5  on 1393  degrees of freedom
## Residual deviance: 1877.2  on 1387  degrees of freedom
## AIC: 1891.2
## 
## Number of Fisher Scoring iterations: 4
# Prediction using model
pred <- predict(Movies_glm_2, newdata = test_split, type = "response")

pred_test <- ifelse(pred > 0.5, 1, 0)
Test_pred <- factor(pred_test, levels=c(0, 1))
Test_act <- factor(test_split$Test)
mean(Test_pred == Test_act)
## [1] 0.5765766
# Predict test.csv data
Movies_test <- read_csv("test.csv", col_names = TRUE)

pred_N <- predict(Movies_glm_2, newdata = Movies_test, type = "response")
pred_test_csv <- ifelse(pred_N > 0.5, 1, 0)
Test_pred_csv <- factor(pred_test_csv, levels=c(0, 1))


Movies_New <- as.data.frame(cbind(Movies_test, Test_pred_csv))

func <- function(x){
  gsub(1, "PASS", gsub(0, "FAIL", x))
}

Movies_Pred <- as.data.frame(lapply(Movies_New$Test_pred_csv, FUN = func))
head(Movies_New)
##   year      imdb   budget domgross  intgross     code budget_2013
## 1 2013 tt1559547 5.00e+07 19452138  55940671 2013PASS    5.00e+07
## 2 2013 tt0765446 4.00e+07 57012977  66454811 2013PASS    4.00e+07
## 3 2013 tt1211956 7.00e+07 25213103 103813103 2013FAIL    7.00e+07
## 4 2013 tt1351685 1.95e+08 65187603 197387603 2013FAIL    1.95e+08
## 5 2013 tt1650554 2.80e+07 28795985  60839197 2013PASS    2.80e+07
## 6 2013 tt2002718 2.00e+06  8008161   9408161 2013PASS    2.00e+06
##   domgross_2013 intgross_2013 Test_pred_csv
## 1      19452138      55940671             1
## 2      57012977      66454811             1
## 3      25213103     103813103             0
## 4      65187603     197387603             0
## 5      28795985      60839197             1
## 6       8008161       9408161             1
save(x = Movies_Pred, file = "Movies_Pred.csv")